import codecs
import re
import pandas as pd
from os.path import join
import os
# Directory that holds the raw CSV files (relative to the working directory).
DATA_DIR = './'
# Full paths of the raw training and test CSV files inside DATA_DIR.
TrainDataPath = os.path.join(DATA_DIR, 'Train_Data.csv')
TestDataPath = os.path.join(DATA_DIR, 'Test_Data.csv')

# Two or more consecutive ASCII question marks. Raw string: "\?" inside a
# normal string literal is an invalid escape sequence and triggers a warning.
_REPEATED_QUESTION_MARKS = re.compile(r"\?{2,}")


def clean_text(x):
    """Strip noise characters from a string.

    Removes, in this order:
      1. every run of two or more consecutive ``?`` characters
         (a single ``?`` is kept),
      2. every ideographic space (U+3000),
      3. every ASCII space.

    Args:
        x: The string to clean.

    Returns:
        The cleaned string.

    Raises:
        TypeError: If ``x`` is not a string (e.g. a NaN read from a CSV).
    """
    x = _REPEATED_QUESTION_MARKS.sub("", x)
    # Plain literal removals do not need the regex engine.
    return x.replace("\u3000", "").replace(" ", "")
def is_chinese(uchar):
    """Tell whether ``uchar`` lies in the range U+4E00..U+9FA5 (inclusive).

    The check is a plain string comparison against the two bounds, so it is
    meant to be called with a single character.

    Args:
        uchar: The character (string) to test.

    Returns:
        True if ``'\\u4e00' <= uchar <= '\\u9fa5'``, otherwise False.
    """
    return u'\u4e00' <= uchar <= u'\u9fa5'

def format_str(content):
    """Keep only the characters of ``content`` accepted by ``is_chinese``.

    Characters are kept in their original order; everything else (digits,
    Latin letters, punctuation, whitespace, ...) is dropped.

    Args:
        content: The string to filter.

    Returns:
        A new string made of the accepted characters; ``''`` when none match
        or when ``content`` is empty.
    """
    # join() builds the result in one pass instead of re-allocating the
    # string on every ``+`` inside a loop.
    return ''.join(ch for ch in content if is_chinese(ch))




# Number of leading rows written to the training split; the rest go to dev.
TRAIN_SIZE = 4000
# Columns written to the output CSV files, in this order.
OUTPUT_COLUMNS = ['id', 'negative', 'context']

df = pd.read_csv(TrainDataPath)

# Use the "text" column, falling back to "title" when "text" is missing.
# Rows where both are missing become "" so clean_text never receives NaN.
# Every row of the file is processed; there is no hard-coded row count.
raw_text = df["text"].fillna(df["title"]).fillna("")
df["context"] = raw_text.map(lambda s: format_str(clean_text(str(s))))

# Positional slicing keeps the two splits disjoint. Label-based
# ``.loc[:4000]`` / ``.loc[4000:]`` is inclusive on both ends and would put
# row 4000 in both train and dev.
train = df.iloc[:TRAIN_SIZE][OUTPUT_COLUMNS]
dev = df.iloc[TRAIN_SIZE:][OUTPUT_COLUMNS]

# Written without index and without a header row.
train.to_csv("clean_train.csv", index=False, header=False)
dev.to_csv("clean_dev.csv", index=False, header=False)

# NOTE: a previously disabled variant, kept here as a bare string literal,
# built "context" as str(title) + str(text) without any cleaning. It was dead
# code and has been removed.





